In [1]:
import pandas as pd
import numpy as np

# Time
from datetime import datetime, timedelta

# Maps and geo
import geopy.distance as dist
import folium
from folium.plugins import HeatMapWithTime

# Visualization
from plotly import graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns
import matplotlib.pyplot as plt

# Preprocessing
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# Models
from sklearn import cluster
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from catboost import CatBoostClassifier

# CV and optimization
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score
from bayes_opt import BayesianOptimization # docs are here https://github.com/fmfn/BayesianOptimization

# Metrics
from sklearn.metrics import classification_report, accuracy_score

# Feature importance 
import shap

# Serialization
import joblib

# Fix random_state for experiments reproducibility
RANDOM_STATE = 42
In [2]:
df = pd.read_csv('robotex5.csv')
In [3]:
# Since we work with time data it's convenient to use sorted values
df.sort_values(by='start_time', inplace=True)
df.reset_index(drop=True, inplace=True)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 627210 entries, 0 to 627209
Data columns (total 6 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   start_time  627210 non-null  object 
 1   start_lat   627210 non-null  float64
 2   start_lng   627210 non-null  float64
 3   end_lat     627210 non-null  float64
 4   end_lng     627210 non-null  float64
 5   ride_value  627210 non-null  float64
dtypes: float64(5), object(1)
memory usage: 28.7+ MB

1. EDA¶

Time and date¶

In [4]:
# Convert to datetime and get separately: date, hour, minute and day of the week which might help us in predictions
df['start_time'] = pd.to_datetime(df['start_time'])
df['date'] = df['start_time'].dt.date
df['hour'] = df['start_time'].dt.hour
df['minute'] = df['start_time'].dt.minute
df['day_of_the_week'] = df['start_time'].dt.dayofweek
In [5]:
# Make bins of 30 min intervals. I assume that 30 min is the time required to place an order, wait for pickup and complete a ride within Tallin.
# It would be nice to have actual average time of the ride, but given the data we need to make the assumption.
# The bins will be used for further demand and gain calculations.
df['start_time_bin'] = df['start_time'].apply(lambda x: x.floor('30Min'))
df['end_time_bin'] = df['start_time_bin'].apply(lambda x: x + timedelta(minutes=30))

Coordinates¶

In [6]:
# We can calculate the distances between given coordinates. It would be nice to use an external API (i.e. Google Maps) to have more accurate road
# distances but since it's not free we can use simple "flying" distances for now. They will be used to calculate gains.

def distance(row, lat1:str, lng1:str, lat2:str, lng2:str) -> float:
    """The function calculates distance between 2 given coordinates
       using geopy package (input lat and lng must be in degrees)"""

    distance = dist.geodesic((row[lat1], row[lng1]), (row[lat2], row[lng2])).km

    return distance
In [7]:
df = df.assign(distance=df.apply(lambda x: distance(x, 'start_lat', 'start_lng', 'end_lat', 'end_lng'), axis=1))
In [8]:
# End coordinates have some unrealistic values, let's clean them up. 
# The idea is to be aligned with start location (so coordinates within the same max, min range +- std)
df = df[(df['end_lat'] > df['start_lat'].min() - df['start_lat'].std()) & 
        (df['end_lat'] < df['start_lat'].max() + df['start_lat'].std()) &
        (df['end_lng'] > df['start_lng'].min() - df['start_lng'].std()) & 
        (df['end_lng'] < df['start_lng'].max() + df['start_lng'].std())]

Clustering zones¶

In [9]:
# Let's create a hourly grouped heat map with all starting positions in order to visually estimate potential zones 
# clusters as well as possible errors. Folium library is used to create and plot the map.

def map_builder(heat:list) -> folium.Map():
    """This function builds the heatmap based on starting order locations list of lists"""

    # Map settings
    m = folium.Map(location=[59.44, 24.75], tiles='OpenStreetMap' , zoom_start=11) # Tallin center location

    # Heat map
    HeatMapWithTime(heat, auto_play=False, index=[i for i in range(len(heat))], radius=5).add_to(m)
 
    return m
In [10]:
# Group lists by hours
heat = []
for i in df['hour'].unique():
    # List of start coordinates lists within hour i
    hour_group = df[df['hour'] == i][['start_lat', 'start_lng']].apply(lambda x: x.tolist(), axis=1).tolist()
    heat.append(hour_group)
In [11]:
# Map
# NOTE: ADJUST THE SLIDER IN THE BOTTOM TO GET HEATMAP FOR SPECIFIC HOUR
map_builder(heat)
Out[11]:
Make this Notebook Trusted to load map: File -> Trust Notebook

Clustering zones with k-means¶

In [12]:
# The visual map analysis gives us an idea that there can be 6-9 clusters (zones), however, we cannot decide on 
# number of zones just using the map, but we can see that the zones are most expressed and presented at 18:00 
# let's use the the sample of the data with all start points at 18:00 and try to split the data into zones
In [13]:
# Elbow Method to find optimal k

X = df[df['hour'] == 18][['start_lat', 'start_lng']].values
max_k = 12 # visually it should be 6-9 but let's give a chance up to 12
       
# Iterate 
error = [] 
for i in range(1, max_k+1):
    if len(X) >= i:
        model = cluster.KMeans(n_clusters=i, init='k-means++', max_iter=300, n_init=10, random_state=RANDOM_STATE)
        model.fit(X)
        error.append(model.inertia_)

# Best k 
error_diff = [i for i in np.diff(error, 2)] # find min diff between errors when it stops converging to 0 
best_k = error_diff.index(min(error_diff)) # and its index
                                                                             
# Plot
fig, ax = plt.subplots()
ax.plot(range(1, len(error)+1), error)
ax.axvline(best_k, ls='--', color="red", label='k = '+str(best_k))
ax.set(title='The Elbow Method', xlabel='Number of clusters', 
       ylabel='Error')
ax.legend()
ax.grid(True)
plt.show()
In [14]:
# Cluster the data
k_means_model = cluster.KMeans(n_clusters=best_k, init='k-means++', max_iter=300, n_init=10, random_state=RANDOM_STATE)
k_means_model.fit(X)
Out[14]:
KMeans(n_clusters=9, n_init=10, random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KMeans(n_clusters=9, n_init=10, random_state=42)
In [15]:
# Make predictions for the start and end coordinates
df['start_zone'] =  k_means_model.predict(df[['start_lat', 'start_lng']].values)
df['end_zone'] = k_means_model.predict(df[['end_lat', 'end_lng']].values)
In [16]:
# Find cluster centroids to plot them further
zone_centers = {}
for i in sorted(df['start_zone'].unique()):
    zone_centers[i] = list(k_means_model.cluster_centers_[i])
In [17]:
# Plot orders with the predicted zones and its centers

def plot_clustered_coordinates(df:pd.DataFrame, lat_column:str, lng_column:str, zone_column:str) -> go.Figure:
    """This function plots the orders with start/end coordinates 
    within clusters and its centers"""
    
    df_plot = df.copy().sample(int(len(df)/100), random_state=RANDOM_STATE) # take a samle to plot it faster
    
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=df_plot[lng_column], y=df_plot[lat_column], mode='markers'
                             , marker=dict(color=df_plot[zone_column]), name=lat_column[:-4]+' points')) # orders
    
    for i in list(zone_centers.keys()):
        fig.add_trace(go.Scatter(x=np.array(zone_centers[i][1]), y=np.array(zone_centers[i][0])
                                 , mode='markers', marker=dict(color='black', size=10),name='zone '+str(i) + ' center')) # centers
        
    config = {'displayModeBar': False} # turn off display mode to save time
    fig.update_layout(paper_bgcolor='rgba(0,0,0,0)',
                      plot_bgcolor='rgba(0,0,0,0)') # white background 
    
    fig.update_xaxes(showline=True, gridcolor='grey') # add grid
    fig.update_yaxes(showline=True, gridcolor='grey')
    
    fig.show(config=config)
In [18]:
# Plot starting points
plot_clustered_coordinates(df, 'start_lat', 'start_lng', 'start_zone')
In [19]:
# Plot ending points
plot_clustered_coordinates(df, 'end_lat', 'end_lng', 'end_zone')
In [20]:
# The clusters(zones) are consistent between starting and ending points. The plot is following Tallin map outlines

2. Features engineering + y_true creation¶

In [21]:
# In the beginning the data was splitted into 30 min intervals. First, let's calculate the incoming amount of orders
# per start zone within each start time interval (demand). Then calculate the appox supply (# cars) that are currently
# in the zone. Supply will be calculated as the number of cars went to a particular zone in the previous 30 min interval

df['demand'] = 0
df['supply'] = 0
start_zone_demand = df.groupby(['start_time_bin', 'start_zone'])['demand'].count().reset_index()
end_zone_supply = df.groupby(['end_time_bin', 'end_zone'])['supply'].count().reset_index()

# Merge the grouped dfs(first time bin supply will not exist since we don't know the data before Mar, 1)
net_demand = start_zone_demand.merge(end_zone_supply, how='left', left_on=['start_time_bin', 'start_zone']
                                     , right_on=['end_time_bin', 'end_zone'])

# Drop first 30 min interval for now and fill the rest of NaN supply values with 0 (if any) 
net_demand = net_demand[net_demand['start_time_bin'] != df['start_time_bin'].min()]
net_demand['supply'] = net_demand['supply'].fillna(0)

# Calculate net demand column
net_demand['net_demand'] = net_demand['demand'] - net_demand['supply']
In [25]:
# Let's merge net_demand to the main df
df = df.merge(net_demand[['start_time_bin', 'start_zone', 'net_demand']], how='left', on=['start_time_bin', 'start_zone'])
df
Out[25]:
start_time start_lat start_lng end_lat end_lng ride_value date hour minute day_of_the_week start_time_bin end_time_bin distance start_zone end_zone demand supply net_demand
0 2022-03-01 00:00:07.936317 59.438142 24.728677 59.553133 24.802705 3.352000 2022-03-01 0 0 1 2022-03-01 00:00:00 2022-03-01 00:30:00 13.479290 6 5 0 0 NaN
1 2022-03-01 00:00:17.556188 59.443230 24.753330 59.391667 24.722047 1.495000 2022-03-01 0 0 1 2022-03-01 00:00:00 2022-03-01 00:30:00 6.012515 2 4 0 0 NaN
2 2022-03-01 00:00:20.355945 59.431849 24.768252 59.433692 24.728579 0.552750 2022-03-01 0 0 1 2022-03-01 00:00:00 2022-03-01 00:30:00 2.260878 2 6 0 0 NaN
3 2022-03-01 00:00:20.690881 59.439587 24.748874 59.452895 24.871234 1.737250 2022-03-01 0 0 1 2022-03-01 00:00:00 2022-03-01 00:30:00 7.098048 2 3 0 0 NaN
4 2022-03-01 00:00:25.804142 59.367751 24.645455 59.396104 24.800205 2.286750 2022-03-01 0 0 1 2022-03-01 00:00:00 2022-03-01 00:30:00 9.345522 0 1 0 0 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
623805 2022-03-28 23:59:19.601000 59.441576 24.739099 59.466716 24.832512 1.467973 2022-03-28 23 59 0 2022-03-28 23:30:00 2022-03-29 00:00:00 5.992805 6 5 0 0 25.0
623806 2022-03-28 23:59:19.757876 59.434211 24.560813 59.436173 24.831474 3.755750 2022-03-28 23 59 0 2022-03-28 23:30:00 2022-03-29 00:00:00 15.361170 7 1 0 0 -14.0
623807 2022-03-28 23:59:22.883111 59.430173 24.773986 59.435510 24.819229 0.644750 2022-03-28 23 59 0 2022-03-28 23:30:00 2022-03-29 00:00:00 2.635571 2 1 0 0 14.0
623808 2022-03-28 23:59:42.778225 59.434750 24.746043 59.445609 24.697914 0.732500 2022-03-28 23 59 0 2022-03-28 23:30:00 2022-03-29 00:00:00 2.986737 6 8 0 0 25.0
623809 2022-03-28 23:59:53.175658 59.434520 24.746352 59.490741 24.836812 2.002250 2022-03-28 23 59 0 2022-03-28 23:30:00 2022-03-29 00:00:00 8.095532 6 5 0 0 25.0

623810 rows × 18 columns

In [26]:
# Impute first net_demand interval with the average across start_zone, dow and same start time bin (hour=0, minute<30)
averages = df[(df['hour'] == 0) & (df['minute'] < 30) & 
              (df['day_of_the_week'] == 1)].groupby('start_zone')['net_demand'].mean()
for i in list(df[df['start_time_bin'] == df['start_time_bin'].min()].index):
    df.loc[i, 'net_demand'] = int(averages[df.loc[i, 'start_zone']])
In [27]:
df.isna().sum().sum()
Out[27]:
0

Create y_true¶

In [28]:
# The idea is to maximaze potential gain and reccomend drivers to be sent to the zone where the gain is max 
# within 30 min interval given the timestamp as only one input.

# First of all, let's create a monetary value per km. The value per km is usually adjusted by time which is our input.
df['value_per_km'] = df['ride_value']/df['distance']

# Now let's multiply the net_demand (cars) we calculated by the potential value per km. 
# Thus, we monetize the net demand. Negative values will stay because they might have stronger signal rather than 0s.
df['net_demand_value'] = df['value_per_km']*df['net_demand']
In [29]:
# Now calculate average net demand values across start zones and 30 min intervals. 
mean_values = df.groupby(['start_time_bin', 'start_zone'])['net_demand_value'].mean().reset_index()
mean_values
Out[29]:
start_time_bin start_zone net_demand_value
0 2022-03-01 00:00:00 0 -0.984442
1 2022-03-01 00:00:00 1 -6.951163
2 2022-03-01 00:00:00 2 18.466528
3 2022-03-01 00:00:00 3 -1.472029
4 2022-03-01 00:00:00 4 -0.740112
... ... ... ...
12061 2022-03-28 23:30:00 4 -5.485491
12062 2022-03-28 23:30:00 5 -1.481468
12063 2022-03-28 23:30:00 6 6.476252
12064 2022-03-28 23:30:00 7 -3.438783
12065 2022-03-28 23:30:00 8 -2.705890

12066 rows × 3 columns

In [30]:
# And then find the max values for each time interval
max_values = pd.DataFrame()
max_values['start_time_bin'] = mean_values['start_time_bin'].unique()

for time_bin in mean_values['start_time_bin'].unique():
    max_value = mean_values[mean_values['start_time_bin'] == time_bin]['net_demand_value'].max()
    
    max_values.loc[max_values['start_time_bin'] == time_bin, 'start_zone'] = mean_values[(mean_values['start_time_bin'] == time_bin) 
                                                                                       & (mean_values['net_demand_value'] == max_value)]['start_zone'].values[0]
In [31]:
max_values
Out[31]:
start_time_bin start_zone
0 2022-03-01 00:00:00 2.0
1 2022-03-01 00:30:00 2.0
2 2022-03-01 01:00:00 2.0
3 2022-03-01 01:30:00 6.0
4 2022-03-01 02:00:00 6.0
... ... ...
1339 2022-03-28 21:30:00 6.0
1340 2022-03-28 22:00:00 2.0
1341 2022-03-28 22:30:00 2.0
1342 2022-03-28 23:00:00 6.0
1343 2022-03-28 23:30:00 6.0

1344 rows × 2 columns

In [32]:
# Now apply the target start_zone to each order (merge is always faster than lambda)
max_values = max_values.rename(columns={'start_zone': 'y_true'})
df = df.merge(max_values, how='left', on='start_time_bin')
In [33]:
# Finally, let's add the average value per km for each combination of hour, minute and day of the week. And keep 
# separate table for it which will be called to get predictions. Don't take zone since we will not know it from input
historical_mean_value_per_km = df.groupby(['hour', 'minute', 'day_of_the_week'])['value_per_km'].mean().reset_index()
historical_mean_value_per_km
Out[33]:
hour minute day_of_the_week value_per_km
0 0 0 0 0.249074
1 0 0 1 0.246736
2 0 0 2 0.249105
3 0 0 3 0.251990
4 0 0 4 0.246589
... ... ... ... ...
10075 23 59 2 0.252948
10076 23 59 3 0.252474
10077 23 59 4 0.272170
10078 23 59 5 0.250241
10079 23 59 6 0.277117

10080 rows × 4 columns

In [34]:
# Save historical_mean_value_per_km to use in the future after model development
joblib.dump(historical_mean_value_per_km, 'historical_mean_value_per_km.gz')
Out[34]:
['historical_mean_value_per_km.gz']
In [35]:
# Keep only the columns we need for further modelling. Day itself is not taken because we have the data within 1 month
# and it will not make a difference. Add historical value per km as well.
df = df[['start_time', 'hour', 'minute', 'day_of_the_week', 'y_true']].dropna()
df = df.merge(historical_mean_value_per_km, how='left', on=['hour', 'minute', 'day_of_the_week'])
df
Out[35]:
start_time hour minute day_of_the_week y_true value_per_km
0 2022-03-01 00:00:07.936317 0 0 1 2.0 0.246736
1 2022-03-01 00:00:17.556188 0 0 1 2.0 0.246736
2 2022-03-01 00:00:20.355945 0 0 1 2.0 0.246736
3 2022-03-01 00:00:20.690881 0 0 1 2.0 0.246736
4 2022-03-01 00:00:25.804142 0 0 1 2.0 0.246736
... ... ... ... ... ... ...
623805 2022-03-28 23:59:19.601000 23 59 0 6.0 0.262537
623806 2022-03-28 23:59:19.757876 23 59 0 6.0 0.262537
623807 2022-03-28 23:59:22.883111 23 59 0 6.0 0.262537
623808 2022-03-28 23:59:42.778225 23 59 0 6.0 0.262537
623809 2022-03-28 23:59:53.175658 23 59 0 6.0 0.262537

623810 rows × 6 columns

In [36]:
# Check the y_true counts to get an idea on how it's balanced
# Classes are imbalanced, weights will be added to the models
df['y_true'].value_counts()
Out[36]:
2.0    213436
6.0    210810
4.0     71252
8.0     39649
1.0     34322
0.0     27618
5.0     16315
3.0      6585
7.0      3823
Name: y_true, dtype: int64

3. Multiclass problem¶

In [37]:
# With the steps above we reduced the problem to the multiclass task. There are 9 classes (zones) and only one y_true 
# which maximizes the potential gain withing the given 30 minutes interval. The idea is to predict a zone which will
# fit the best for a customer, driver and the company with the given timestamp as an input.

Data preprocessing¶

In [38]:
features = list(df.drop(['start_time', 'y_true'], axis=1).columns)
In [39]:
# Hour, minute and day of the week can be considered as categorical variables. 
# Let's take a look at their distributions

def plot_feature_distribution(df:pd.DataFrame, feature:str) -> plt.Figure:
    """Plots quick seaborn distribution for the given feature"""
    
    plot_df = df.groupby(feature)['y_true'].count().reset_index()
    sns.histplot(df[feature], bins=10, label=feature)

    plt.show()
In [40]:
for feature in features:
    plot_feature_distribution(df, feature)
In [41]:
# Hour, minute, dow are categorical values. Value per km has lleft skewed distribution. Let's take a log.
df['value_per_km'] = np.log(df['value_per_km'])

Models¶

Data split for all models¶

In [42]:
# Make a sample to find the optimal hyperparameters faster
df_sample = df.sample(int(len(df)/10), random_state = RANDOM_STATE)
X_sample, y_sample = df_sample.drop(['start_time', 'y_true'], axis=1).values, df_sample['y_true'].values
X_sample_train, X_sample_test, y_sample_train, y_sample_test = train_test_split(X_sample, y_sample,
                                                                    test_size=0.3, random_state=RANDOM_STATE)
In [43]:
# Split with validation set
X, y = df.drop(['start_time', 'y_true'], axis=1).values, df['y_true'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=RANDOM_STATE)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=0.33, random_state=RANDOM_STATE) # val approx 10% of total
In [44]:
print(len(X_val))
print(len(X_test))
print(len(X_train))
61758
125385
436667

Feature importance¶

In [45]:
# Train simple model with default params
model = xgb.XGBClassifier()
model.fit(X_sample_train, y_sample_train, verbose=True)
Out[45]:
XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_bin=256, max_cat_threshold=64, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, objective='multi:softprob',
              predictor='auto', ...)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_bin=256, max_cat_threshold=64, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_parallel_tree=1, objective='multi:softprob',
              predictor='auto', ...)
In [46]:
# Explain the model's predictions using SHAP values
shap_values = shap.TreeExplainer(model).shap_values(X_sample_test)
shap.summary_plot(shap_values, X_sample_test)
/Users/victor.kovatsenko/.local/share/virtualenvs/Assignment-k2b9DKi7/lib/python3.10/site-packages/xgboost/core.py:122: UserWarning:

ntree_limit is deprecated, use `iteration_range` or model slicing instead.

In [47]:
# The features passed in the same order as they are in the features list. According to the SHAP values hour and day of
# the week are the most important. Minute is less which is logical because it has more unique values hence less separation
# power. But let's keep all the features for now since no one has close to 0 shap value
features
Out[47]:
['hour', 'minute', 'day_of_the_week', 'value_per_km']

Common class for all models to optimize params¶

In [48]:
class Model:
    """The class takes the model parameters which should be fixed and thos which should be optimized
       It also includes unified bayesian optimization object for hyperparameters.
       Fit method is not included becaus it has different setup for models"""
    
    def __init__(self, model_object:object, fixed_params:dict, pbounds:dict, eval_func:object):
        self.model_object = model_object
        self.fixed_params = fixed_params
        self.pbounds = pbounds
        self.eval_func = eval_func
        self.best_params = None
        
    # Optimizer
    def bayesian_optimizer(self) -> dict:
        """Optimizrer object creation"""
    
        optimizer = BayesianOptimization(
            f=self.eval_func,
            pbounds=self.pbounds,
            random_state=RANDOM_STATE
        )
        
        optimizer.maximize(init_points=10, n_iter=10)

        # Print the best hyperparameters found
        print(optimizer.max)
        best_params = optimizer.max['params']
        self.best_params = best_params
    
        return best_params
    
    def get_final_params(self) -> dict:
        params = {**self.best_params, **self.fixed_params}
        
        return params

XGB Classifier¶

In [49]:
# Make a Bayesian optimization with CV on 5 folds for train data to find the best params.
# It will save computational resources if to perform Bayesian optimization on a sample

def xgb_evaluate(learning_rate:float, max_depth:int, gamma:float, min_child_weight:float, subsample:float, 
                 colsample_bytree:float, reg_lambda:float, alpha:float) -> float:
    """The function evaluates the cross validated mean metric on a given params for XGBClassifier"""
    
    params = {
        "objective": "multi:softmax", # softmax is used to get fatser results
        "num_class": 9,
        "booster": "gbtree", # fix the booster to save time 
        "eval_metric": "merror", # multiclass classification error rate
        "learning_rate": learning_rate,
        "max_depth": int(max_depth), # should be int
        "gamma": gamma,
        "min_child_weight": min_child_weight,
        "subsample": subsample,
        "colsample_bytree": colsample_bytree,
        "reg_lambda": reg_lambda,
        "alpha": alpha,
        "random_state": RANDOM_STATE
    }

    # Train
    cv_results = xgb.cv(
        params,
        xgb.DMatrix(X_sample_train, y_sample_train),
        num_boost_round=100, # max rounds (restricted further by early stopping rounds)
        nfold=5,
        metrics=['merror'],
        early_stopping_rounds=10, # 10 rounds if no score improvement to reduce overfitting
        seed=RANDOM_STATE
    )

    return -1.0 * cv_results["test-merror-mean"].iloc[-1] # will maximize negative loss
In [50]:
# Search space
xgb_pbounds = {
    "learning_rate": (0.01, 0.5),
    "max_depth": (2, 10),
    "gamma": (0, 1),
    "min_child_weight": (1, 20),
    "subsample": (0.5, 1),
    "colsample_bytree": (0.5, 1),
    "reg_lambda": (0, 1),
    "alpha": (0, 1)
}

fixed_params = {
    'eval_metric': 'merror',
    'objective': 'multi:softmax',
    'num_class': 9,
    'booster': 'gbtree',
    'random_state': RANDOM_STATE
}
In [51]:
model = Model(xgb.XGBClassifier, fixed_params, xgb_pbounds, eval_func=xgb_evaluate)
In [52]:
best_params = model.bayesian_optimizer()
|   iter    |  target   |   alpha   | colsam... |   gamma   | learni... | max_depth | min_ch... | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------
| 1         | -0.4078   | 0.3745    | 0.9754    | 0.732     | 0.3033    | 3.248     | 3.964     | 0.05808   | 0.9331    |
| 2         | -0.3721   | 0.6011    | 0.854     | 0.02058   | 0.4853    | 8.66      | 5.034     | 0.1818    | 0.5917    |
| 3         | -0.3712   | 0.3042    | 0.7624    | 0.4319    | 0.1527    | 6.895     | 3.65      | 0.2921    | 0.6832    |
| 4         | -0.3722   | 0.4561    | 0.8926    | 0.1997    | 0.262     | 6.739     | 1.883     | 0.6075    | 0.5853    |
| 5         | -0.3785   | 0.06505   | 0.9744    | 0.9656    | 0.4061    | 4.437     | 2.856     | 0.6842    | 0.7201    |
| 6         | -0.3882   | 0.122     | 0.7476    | 0.03439   | 0.4556    | 4.07      | 13.59     | 0.3117    | 0.76      |
| 7         | -0.3747   | 0.5467    | 0.5924    | 0.9696    | 0.3898    | 9.516     | 18.0      | 0.5979    | 0.9609    |
| 8         | -0.3939   | 0.08849   | 0.598     | 0.04523   | 0.1694    | 5.109     | 6.156     | 0.8287    | 0.6784    |
| 9         | -0.4394   | 0.2809    | 0.7713    | 0.1409    | 0.4031    | 2.596     | 19.75     | 0.7722    | 0.5994    |
| 10        | -0.3707   | 0.005522  | 0.9077    | 0.7069    | 0.3672    | 8.17      | 2.407     | 0.3585    | 0.5579    |
| 11        | -0.3744   | 1.0       | 0.5       | 1.0       | 0.5       | 10.0      | 12.72     | 0.5933    | 1.0       |
| 12        | -0.3589   | 0.0       | 1.0       | 1.0       | 0.01      | 10.0      | 8.657     | 1.0       | 0.5       |
| 13        | -0.3669   | 1.0       | 1.0       | 0.0       | 0.5       | 8.002     | 9.9       | 1.0       | 0.5       |
| 14        | -0.3793   | 0.9179    | 0.5239    | 0.2792    | 0.4505    | 9.859     | 9.52      | 0.03162   | 0.5876    |
| 15        | -0.3819   | 0.0       | 1.0       | 1.0       | 0.01      | 7.311     | 11.67     | 1.0       | 0.5       |
| 16        | -0.3651   | 0.2006    | 0.922     | 0.9428    | 0.2736    | 8.747     | 7.215     | 0.9962    | 0.9609    |
| 17        | -0.4463   | 1.0       | 0.5       | 1.0       | 0.01      | 10.0      | 4.086     | 1.0       | 0.5       |
| 18        | -0.3685   | 0.0       | 1.0       | 1.0       | 0.01      | 8.376     | 8.862     | 1.0       | 0.5       |
| 19        | -0.3652   | 1.0       | 1.0       | 0.0       | 0.5       | 8.02      | 7.335     | 0.0       | 0.5       |
| 20        | -0.3597   | 0.0       | 1.0       | 0.0       | 0.01      | 10.0      | 15.31     | 0.0       | 1.0       |
=========================================================================================================================
{'target': -0.3588830807951887, 'params': {'alpha': 0.0, 'colsample_bytree': 1.0, 'gamma': 1.0, 'learning_rate': 0.01, 'max_depth': 10.0, 'min_child_weight': 8.657315733957876, 'reg_lambda': 1.0, 'subsample': 0.5}}
In [53]:
params = model.get_final_params()
params['max_depth'] = int(params['max_depth']) # correct type
In [54]:
# Create class weights, more weight to smaller classes
sample_weights_train = [1/(len(y_train[y_train == i])/len(y_train)) for i in sorted(set(y_train))]
sample_weights_val = [1/(len(y_val[y_val == i])/len(y_val)) for i in sorted(set(y_val))]

train_weights = [sample_weights_train[int(i)] for i in y_train]
val_weights = [sample_weights_val[int(i)] for i in y_val]
In [64]:
# Train the best model (unbalanced)
xgb_clf = xgb.XGBClassifier(**params)
xgb_clf.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=10, verbose=True)
/Users/victor.kovatsenko/.local/share/virtualenvs/Assignment-k2b9DKi7/lib/python3.10/site-packages/xgboost/sklearn.py:861: UserWarning:

`early_stopping_rounds` in `fit` method is deprecated for better compatibility with scikit-learn, use `early_stopping_rounds` in constructor or`set_params` instead.

[0]	validation_0-merror:0.35608
[1]	validation_0-merror:0.35490
[2]	validation_0-merror:0.35573
[3]	validation_0-merror:0.35531
[4]	validation_0-merror:0.35508
[5]	validation_0-merror:0.35442
[6]	validation_0-merror:0.35424
[7]	validation_0-merror:0.35408
[8]	validation_0-merror:0.35419
[9]	validation_0-merror:0.35408
[10]	validation_0-merror:0.35448
[11]	validation_0-merror:0.35425
[12]	validation_0-merror:0.35471
[13]	validation_0-merror:0.35467
[14]	validation_0-merror:0.35435
[15]	validation_0-merror:0.35476
[16]	validation_0-merror:0.35461
[17]	validation_0-merror:0.35425
Out[64]:
XGBClassifier(alpha=0.0, base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1.0,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric='merror', feature_types=None, gamma=1.0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.01, max_bin=256,
              max_cat_threshold=64, max_cat_to_onehot=4, max_delta_step=0,
              max_depth=10, max_leaves=0, min_child_weight=8.657315733957876,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_class=9, num_parallel_tree=1, ...)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
XGBClassifier(alpha=0.0, base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1.0,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric='merror', feature_types=None, gamma=1.0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.01, max_bin=256,
              max_cat_threshold=64, max_cat_to_onehot=4, max_delta_step=0,
              max_depth=10, max_leaves=0, min_child_weight=8.657315733957876,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_class=9, num_parallel_tree=1, ...)
In [65]:
# Train the best model (balanced)
xgb_clf_balanced = xgb.XGBClassifier(**params)
xgb_clf_balanced.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=10, 
            sample_weight=train_weights,  
            sample_weight_eval_set=[val_weights], 
            verbose=True)
[0]	validation_0-merror:0.27920
[1]	validation_0-merror:0.27688
[2]	validation_0-merror:0.27264
[3]	validation_0-merror:0.27315
[4]	validation_0-merror:0.27328
[5]	validation_0-merror:0.27193
[6]	validation_0-merror:0.27182
[7]	validation_0-merror:0.27151
[8]	validation_0-merror:0.27205
[9]	validation_0-merror:0.27119
[10]	validation_0-merror:0.27008
[11]	validation_0-merror:0.27041
[12]	validation_0-merror:0.27011
[13]	validation_0-merror:0.27110
[14]	validation_0-merror:0.27082
[15]	validation_0-merror:0.27030
[16]	validation_0-merror:0.27040
[17]	validation_0-merror:0.26966
[18]	validation_0-merror:0.26857
[19]	validation_0-merror:0.26875
[20]	validation_0-merror:0.26850
[21]	validation_0-merror:0.26865
[22]	validation_0-merror:0.26859
[23]	validation_0-merror:0.27007
[24]	validation_0-merror:0.26955
[25]	validation_0-merror:0.26860
[26]	validation_0-merror:0.26877
[27]	validation_0-merror:0.26873
[28]	validation_0-merror:0.26860
[29]	validation_0-merror:0.26868
[30]	validation_0-merror:0.26865
Out[65]:
XGBClassifier(alpha=0.0, base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1.0,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric='merror', feature_types=None, gamma=1.0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.01, max_bin=256,
              max_cat_threshold=64, max_cat_to_onehot=4, max_delta_step=0,
              max_depth=10, max_leaves=0, min_child_weight=8.657315733957876,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_class=9, num_parallel_tree=1, ...)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
XGBClassifier(alpha=0.0, base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1.0,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric='merror', feature_types=None, gamma=1.0, gpu_id=-1,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', learning_rate=0.01, max_bin=256,
              max_cat_threshold=64, max_cat_to_onehot=4, max_delta_step=0,
              max_depth=10, max_leaves=0, min_child_weight=8.657315733957876,
              missing=nan, monotone_constraints='()', n_estimators=100,
              n_jobs=0, num_class=9, num_parallel_tree=1, ...)
In [66]:
# Save classifiers to use in the future
joblib.dump(xgb_clf, 'xgb_clf.joblib')
joblib.dump(xgb_clf_balanced, 'xgb_clf_balanced.joblib')
Out[66]:
['xgb_clf_balanced.joblib']
In [67]:
# Predictions
xgb_train_pred = xgb_clf.predict(X_train)
xgb_test_pred = xgb_clf.predict(X_test)

xgb_train_pred_balanced = xgb_clf_balanced.predict(X_train)
xgb_test_pred_balanced = xgb_clf_balanced.predict(X_test)
In [68]:
# Train/test accuracy to check overfitting
print(f"Train accuracy: {accuracy_score(y_train, xgb_train_pred)}")
print(f"Test accuracy: {accuracy_score(y_test, xgb_test_pred)}")
Train accuracy: 0.6485033217531895
Test accuracy: 0.6451090640826255
In [69]:
# Get the other metrics on test
print(classification_report(y_test, xgb_test_pred))
              precision    recall  f1-score   support

         0.0       0.49      0.37      0.42      5511
         1.0       0.48      0.49      0.49      6879
         2.0       0.69      0.80      0.74     42998
         3.0       0.51      0.19      0.27      1344
         4.0       0.52      0.47      0.49     14331
         5.0       0.61      0.70      0.65      3277
         6.0       0.71      0.67      0.69     42303
         7.0       0.42      0.76      0.54       819
         8.0       0.53      0.38      0.44      7923

    accuracy                           0.65    125385
   macro avg       0.55      0.54      0.53    125385
weighted avg       0.64      0.65      0.64    125385

In [70]:
# Get the other metrics on test
print('Balanced classes report:')
print(classification_report(y_test, xgb_test_pred_balanced))
Balanced classes report:
              precision    recall  f1-score   support

         0.0       0.33      0.85      0.47      5511
         1.0       0.36      0.77      0.49      6879
         2.0       0.79      0.57      0.66     42998
         3.0       0.26      1.00      0.41      1344
         4.0       0.43      0.42      0.43     14331
         5.0       0.44      0.93      0.60      3277
         6.0       0.79      0.44      0.56     42303
         7.0       0.37      0.99      0.54       819
         8.0       0.38      0.63      0.47      7923

    accuracy                           0.55    125385
   macro avg       0.46      0.73      0.51    125385
weighted avg       0.66      0.55      0.57    125385

Results comment:¶

Based on the metrics achieved, the model perofrmance is good enough for stand alone 9 classes model with 4 features, adding class weights allowed us to get more consistent metrics across diferent classes, but the average accuracy was 18% lower than for the model without weights. The unbalanced model predicts better larger classes and worse the smaller ones. Since the idea is to build a base model and test it as real life experiment let's focus on larger classes (zones). Thus, keep it unbalanced.

Random_forest¶

In [71]:
# Define evaluation function for RF

def rf_evaluate(n_estimators:int, max_depth:int, min_samples_split:int, min_samples_leaf:int, 
                max_leaf_nodes:int) -> float:
    """The function evaluates the cross validated mean metric on a given params for RFClassifier"""
    
    params = {
        "criterion": 'gini',
        "class_weight": 'balanced',
        "n_estimators": int(n_estimators), 
        "max_depth": int(max_depth),
        "min_samples_split": int(min_samples_split),
        "min_samples_leaf": int(min_samples_leaf), 
        "max_leaf_nodes": int(max_leaf_nodes),
        "random_state": RANDOM_STATE
    }
    
    rf = RandomForestClassifier(**params)
    
    # CV
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE) # we have imbalanced y_true

    X,y = X_sample_train, y_sample_train
    accuracy_list = []

    for train_index, test_index in kfold.split(X, y):
        x_train_fold, x_test_fold = X[train_index], X[test_index]
        y_train_fold, y_test_fold = y[train_index], y[test_index]
        rf.fit(x_train_fold, y_train_fold)
        preds = rf.predict(x_test_fold)
        accuracy_test = accuracy_score(preds, y_test_fold)
        accuracy_list.append(accuracy_test)

    score = np.array(accuracy_list).mean()

    return score
In [72]:
# Search space
rf_pbounds = {
    "n_estimators": (10, 100), 
    "max_depth": (1, 50),
    "min_samples_split": (2, 100),
    "min_samples_leaf": (1, 20), 
    "max_leaf_nodes": (2, 10)
    }

rf_fixed_params = {
    "criterion": 'gini',
    "class_weight": 'balanced',
    "random_state": RANDOM_STATE
}
In [73]:
X_sample_train
Out[73]:
array([[16.        , 57.        ,  2.        , -1.36948645],
       [17.        , 49.        ,  4.        , -1.31136097],
       [13.        ,  7.        ,  6.        , -1.40023023],
       ...,
       [17.        , 38.        ,  1.        , -1.39894261],
       [ 9.        , 32.        ,  2.        , -1.35240732],
       [17.        , 32.        ,  1.        , -1.40047937]])
In [74]:
model = Model(RandomForestClassifier, rf_fixed_params, rf_pbounds, eval_func=rf_evaluate)
In [75]:
rf_best_params = model.bayesian_optimizer()
|   iter    |  target   | max_depth | max_le... | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------------------
| 1         | 0.3288    | 19.35     | 9.606     | 14.91     | 60.67     | 24.04     |
| 2         | 0.2045    | 8.644     | 2.465     | 17.46     | 60.91     | 73.73     |
| 3         | 0.2184    | 2.009     | 9.759     | 16.82     | 22.81     | 26.36     |
| 4         | 0.2962    | 9.987     | 4.434     | 10.97     | 44.33     | 36.21     |
| 5         | 0.2234    | 30.98     | 3.116     | 6.551     | 37.9      | 51.05     |
| 6         | 0.2156    | 39.47     | 3.597     | 10.77     | 60.06     | 14.18     |
| 7         | 0.2478    | 30.77     | 3.364     | 2.236     | 94.99     | 96.91     |
| 8         | 0.2964    | 40.61     | 4.437     | 2.856     | 69.05     | 49.61     |
| 9         | 0.3132    | 6.98      | 5.961     | 1.653     | 91.11     | 33.29     |
| 10        | 0.2992    | 33.46     | 4.494     | 10.88     | 55.58     | 26.64     |
| 11        | 0.3281    | 19.97     | 7.529     | 5.903     | 83.54     | 64.88     |
| 12        | 0.2073    | 45.41     | 2.843     | 3.798     | 92.68     | 79.36     |
| 13        | 0.2075    | 29.62     | 2.382     | 1.68      | 12.24     | 30.99     |
| 14        | 0.3293    | 23.96     | 9.944     | 18.52     | 44.32     | 38.4      |
| 15        | 0.3133    | 33.71     | 5.403     | 11.23     | 56.75     | 25.68     |
| 16        | 0.3331    | 27.33     | 10.0      | 15.2      | 59.18     | 26.79     |
| 17        | 0.3368    | 19.29     | 10.0      | 19.67     | 52.12     | 31.4      |
| 18        | 0.3371    | 19.67     | 10.0      | 14.44     | 60.21     | 38.58     |
| 19        | 0.3363    | 27.57     | 10.0      | 7.867     | 69.82     | 32.18     |
| 20        | 0.3342    | 19.07     | 9.527     | 7.745     | 78.74     | 45.89     |
=====================================================================================
{'target': 0.3371274875212428, 'params': {'max_depth': 19.668820678019422, 'max_leaf_nodes': 10.0, 'min_samples_leaf': 14.440657566932162, 'min_samples_split': 60.208326675282756, 'n_estimators': 38.575079169590666}}
In [76]:
rf_params = model.get_final_params()
rf_params['max_depth'] = int(rf_params['max_depth'])
rf_params['n_estimators'] = int(rf_params['n_estimators'])
rf_params['min_samples_split'] = int(rf_params['min_samples_split'])
rf_params['min_samples_leaf'] = int(rf_params['min_samples_leaf'])
rf_params['max_leaf_nodes'] = int(rf_params['max_leaf_nodes'])
In [77]:
# Train the best model
rf_clf = RandomForestClassifier(**rf_params)
rf_clf.fit(X_train, y_train) # sample_weight option
Out[77]:
RandomForestClassifier(class_weight='balanced', max_depth=19, max_leaf_nodes=10,
                       min_samples_leaf=14, min_samples_split=60,
                       n_estimators=38, random_state=42)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(class_weight='balanced', max_depth=19, max_leaf_nodes=10,
                       min_samples_leaf=14, min_samples_split=60,
                       n_estimators=38, random_state=42)
In [78]:
# Save classifier to use in the future
joblib.dump(rf_clf, 'rf_clf.joblib')
Out[78]:
['rf_clf.joblib']
In [79]:
rf_train_pred = rf_clf.predict(X_train)
rf_test_pred = rf_clf.predict(X_test)
In [80]:
# Get the metrics on test. 
print(classification_report(y_test, rf_test_pred))
              precision    recall  f1-score   support

         0.0       0.18      0.19      0.19      5511
         1.0       0.17      0.71      0.27      6879
         2.0       0.65      0.45      0.53     42998
         3.0       0.05      0.50      0.10      1344
         4.0       0.76      0.03      0.06     14331
         5.0       0.31      0.89      0.46      3277
         6.0       0.55      0.15      0.23     42303
         7.0       0.15      0.95      0.26       819
         8.0       0.15      0.43      0.23      7923

    accuracy                           0.32    125385
   macro avg       0.33      0.48      0.26    125385
weighted avg       0.53      0.32      0.32    125385

Did not work out, results are not acceptable :) Let's try another one!¶

CatBoost Classifier¶

In [81]:
# Define evaluation function for CatBoost

def cb_evaluate(iterations:int, learning_rate:float, depth:int, l2_leaf_reg:float) -> float:
    """The function evaluates the cross validated mean metric on a given params for CatBoostClassifier"""
    
    params = {
        "loss_function": 'MultiClassOneVsAll', # specific CatBoost multiclass optimized for imbalanced datasets 
        "eval_metric": 'Accuracy', # estimate accuracy
        "bootstrap_type": 'Bayesian', # for regularization purposes only
        "iterations": int(iterations),
        "learning_rate": learning_rate, 
        "depth": int(depth),
        "l2_leaf_reg": l2_leaf_reg,
        "random_seed": RANDOM_STATE,
        "verbose": False
    }
    
    cb = CatBoostClassifier(**params)
    
    # CV
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE) # we have imbalanced y_true

    X,y = X_sample_train, y_sample_train
    accuracy_list = []

    for train_index, test_index in kfold.split(X, y):
        x_train_fold, x_test_fold = X[train_index], X[test_index]
        y_train_fold, y_test_fold = y[train_index], y[test_index]
        cb.fit(x_train_fold, y_train_fold)
        preds = cb.predict(x_test_fold)
        accuracy_test = accuracy_score(preds, y_test_fold)
        accuracy_list.append(accuracy_test)

    score = np.array(accuracy_list).mean()

    return score
In [82]:
# Search space
cb_pbounds = {
    "iterations": (10, 200), 
    "learning_rate": (0.1, 0.4),
    "depth": (5, 10),
    "l2_leaf_reg": (0, 1)
    }

cb_fixed_params = {
    "loss_function": 'MultiClassOneVsAll', # specific CatBoost multiclass optimized for imbalanced datasets 
    "eval_metric": 'Accuracy', # estimate accuracy
    "bootstrap_type": 'Bayesian',
    "random_seed": RANDOM_STATE
}
In [83]:
model = Model(CatBoostClassifier, cb_fixed_params, cb_pbounds, eval_func=cb_evaluate)
In [84]:
cb_best_params = model.bayesian_optimizer()
|   iter    |  target   |   depth   | iterat... | l2_lea... | learni... |
-------------------------------------------------------------------------
| 1         | 0.6386    | 6.873     | 190.6     | 0.732     | 0.2796    |
| 2         | 0.5993    | 5.78      | 39.64     | 0.05808   | 0.3599    |
| 3         | 0.6309    | 8.006     | 144.5     | 0.02058   | 0.391     |
| 4         | 0.6252    | 9.162     | 50.34     | 0.1818    | 0.155     |
| 5         | 0.6307    | 6.521     | 109.7     | 0.4319    | 0.1874    |
| 6         | 0.6125    | 8.059     | 36.5      | 0.2921    | 0.2099    |
| 7         | 0.6376    | 7.28      | 159.2     | 0.1997    | 0.2543    |
| 8         | 0.5364    | 7.962     | 18.83     | 0.6075    | 0.1512    |
| 9         | 0.6304    | 5.325     | 190.3     | 0.9656    | 0.3425    |
| 10        | 0.5731    | 6.523     | 28.56     | 0.6842    | 0.232     |
| 11        | 0.6365    | 9.891     | 199.5     | 0.7747    | 0.2213    |
| 12        | 0.6394    | 10.0      | 175.5     | 0.0       | 0.1       |
| 13        | 0.6282    | 5.0       | 126.5     | 1.0       | 0.2858    |
| 14        | 0.6285    | 10.0      | 67.68     | 0.4778    | 0.1       |
| 15        | 0.6346    | 7.975     | 90.25     | 0.09      | 0.2027    |
| 16        | 0.6228    | 5.155     | 79.03     | 0.004678  | 0.3341    |
| 17        | 0.6304    | 5.019     | 168.9     | 0.04158   | 0.2912    |
| 18        | 0.6394    | 9.924     | 99.73     | 0.7001    | 0.2378    |
| 19        | 0.6395    | 10.0      | 184.3     | 1.0       | 0.1       |
| 20        | 0.5604    | 5.027     | 59.82     | 0.9584    | 0.1205    |
=========================================================================
{'target': 0.6395136105448851, 'params': {'depth': 10.0, 'iterations': 184.26462548577697, 'l2_leaf_reg': 1.0, 'learning_rate': 0.1}}
In [85]:
cb_params = model.get_final_params()
cb_params['depth'] = int(cb_params['depth'])
cb_params['iterations'] = int(cb_params['iterations'])
In [ ]:
# Train the best model (output cleared in order not to overload html with 166 iterations)
cb_clf = CatBoostClassifier(**cb_params)
cb_clf.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=10, verbose=True) 
In [87]:
# Save classifier to use in the future
joblib.dump(cb_clf, 'cb_clf.joblib')
Out[87]:
['cb_clf.joblib']
In [88]:
cb_train_pred = cb_clf.predict(X_train)
cb_test_pred = cb_clf.predict(X_test)
In [89]:
# Train/test accuracy to check overfitting
print(f"Train accuracy: {accuracy_score(y_train, cb_train_pred)}")
print(f"Test accuracy: {accuracy_score(y_test, cb_test_pred)}")
Train accuracy: 0.6536788903214578
Test accuracy: 0.6523986122741955
In [90]:
# Get the metrics on test. 
print(classification_report(y_test, cb_test_pred))
              precision    recall  f1-score   support

         0.0       0.53      0.31      0.39      5511
         1.0       0.53      0.45      0.49      6879
         2.0       0.70      0.80      0.75     42998
         3.0       0.55      0.19      0.28      1344
         4.0       0.52      0.53      0.52     14331
         5.0       0.61      0.70      0.65      3277
         6.0       0.69      0.69      0.69     42303
         7.0       0.51      0.54      0.52       819
         8.0       0.59      0.36      0.45      7923

    accuracy                           0.65    125385
   macro avg       0.58      0.51      0.53    125385
weighted avg       0.65      0.65      0.64    125385

Results comment:¶

The results are similar to what we got using xgboost model. The model predicts better for larger classes and worse for smaller. As it was explained in xgboost model part, we're focusing on larger classes, so keep it unbalanced.

Ensemble¶

In [91]:
# The idea is to take predictions from 2 best models (xgboost, and catboost) and check if ensemble voting model 
# performs better than stand alone models
In [92]:
ensemble_model = VotingClassifier(estimators=[('xgb_clf', xgb_clf), ('cb_clf', cb_clf)], voting='soft')
In [ ]:
ensemble_model.fit(X_train, y_train) # output cleared for html render purposes
In [94]:
ensemble_model_train_pred = ensemble_model.predict(X_train)
ensemble_model_test_pred = ensemble_model.predict(X_test)
In [95]:
# Train/test accuracy to check overfitting
print(f"Train accuracy: {accuracy_score(y_train, ensemble_model_train_pred)}")
print(f"Test accuracy: {accuracy_score(y_test, ensemble_model_test_pred)}")
Train accuracy: 0.6563353768432237
Test accuracy: 0.6534752960880488
In [96]:
# Get the metrics on test. 
print(classification_report(y_test, ensemble_model_test_pred))
              precision    recall  f1-score   support

         0.0       0.52      0.33      0.40      5511
         1.0       0.52      0.46      0.49      6879
         2.0       0.70      0.79      0.75     42998
         3.0       0.53      0.19      0.28      1344
         4.0       0.52      0.53      0.53     14331
         5.0       0.61      0.70      0.65      3277
         6.0       0.69      0.69      0.69     42303
         7.0       0.47      0.63      0.54       819
         8.0       0.58      0.37      0.45      7923

    accuracy                           0.65    125385
   macro avg       0.57      0.52      0.53    125385
weighted avg       0.65      0.65      0.65    125385

In [97]:
# Save classifier to use in the future
joblib.dump(ensemble_model, 'ensemble_model.joblib')
Out[97]:
['ensemble_model.joblib']
In [98]:
# Results are similar to xgboost outcomes regardless 'hard'/'soft' voting parameter.
In [102]:
# Check predict proba for 1 input
probs = ensemble_model.predict_proba(X_test[0].reshape(1, -1)).tolist()[0]
sorted_probs = sorted(enumerate(probs), key=lambda x: x[1], reverse=True)[:3] # get 3 best zones and index
estimate = [i[0] for i in sorted_probs] # get indexes of 3 best zones in desc order
print(f'Recommended zones: {estimate}')
Recommended zones: [4, 6, 8]
Results summary: the models separately as well as ensembled one provide good performance results in terms of average accuracy without class balancing. The model usage may lead to the situation when drivers will not receive the recommendations for relatively smaller zones and they may not be 'covered' in terms of demand-supply balance. It may lead to customers' unsatisfaction and potential drain. Hence, 2 conclusions:¶
  1. The current base model is good to check the experiment feasibility. So, I would use the model for validating the experiment results.
  2. To implement the model as a new app feature for all drivers, the base model should be improved: we need to gather more variables and data (will explain in section 6), probably adjust y_true, and, depending on business priorities, balance it by applying sample class weights.

4. How to deploy?¶

The idea is to deploy the model as a microservice app. To do that we need, first of all, fix all dependencies and packages using for example pipfile from pipenv and don't forget to serialize all necessary objects (models, tables, scalers etc.). Then do the following steps:

  1. Create flask (or other framework) app which will serve our requests.
  2. Dockerize flask app as well as all serialized objects and dependencies.
  3. Deploy (push) the container, for example, on AWS Elastic Beanstalk, it will create an endpoint.
  4. The endpoint will return 3 best recommended zones in sorted order based on the predict_proba method appplied to the model. If probability of any of 3 recomended zones is empty it should not be communicated to a driver.
  5. Don't forget to test the endpoint by sending requests (in our case it will be a json which contains 1 or more timestamps). The request can be sent, for example, using curl.
In [103]:
# Here is an example of app.py file content:

# from flask import Flask, jsonify, request
# import joblib
# import numpy as np
# import ... some manually defined functions (if any)

# app = Flask(__name__)
# app.secret_key = 'zones_app'

# MODEL = joblib.load('ensemble_model.joblib')


# @app.route('/api', methods=['GET'])
# def api():
#     """Handle request and output model score in json format."""

#     # Handle empty requests.
#     if not request.json:
#         return jsonify({'error': 'no request received'})

#     # Parse request args into feature array for prediction.
#     x = parse_args(request.json)

#     # Predict on x_array and return JSON response.
#     probs = MODEL.predict_proba(x)
#     sorted_probs = sorted(enumerate(probs), key=lambda x: x[1], reverse=True)[:3] # get 3 best zones and index
#     estimate = [i[0] for i in sorted_probs] # get indexes
#     response = dict(RECOMMENDED_ZONES=estimate)
#     return jsonify(response)
    
# def parse_args(request_dict) -> np.array:
#     """Here json timestamp will be transformed into df, hour, minute, day of the week calculated
#     as well as historical value per km left join"""
#     df = ... # apply transforms, etc. for the data from request_dict
#     x = df.values # get array
#     return x # in predict proba acceptable format
    
# if __name__ == '__main__':
#     app.run(host='0.0.0.0', port=5000, debug=True)

5. How to communicate to drivers?¶

In order to communicate to a driver in polite and respectful way I would propose following steps:

  1. Make a short (video) training about the new coming feature and how to use it. Explain the potential benefits and its importance for the company.
  2. When a driver opens Bolt app for first time during a day we can send a push with a text like "Would like to get a recommended starting zone?" The push can be turned off in settings as well. The option can be also called during the working day by clicking small icon let's say in the right upper corner.
  3. If the driver clicks "Yes" a driver's device will send a timestamp automatically to our microservice. The timestamp will be processed, historical value per km added and recommendation of 3 best yones in descending order set to the driver as buttons.
  4. The driver should click on one of the 3 buttons with recommended zones (preferrably first as it has the highest probability). The app will automatically show the coordinates of the chosen zone centroid which we got during clusterization.

6. Which data I would use additionally?¶

The given dataset was lacking some of the important (in my opinion) data:

  1. Drivers' ids and their starting/current location. It would help to better estimate how many cars are currently in the specific zone and trace their movements around the working day.
  2. End time of the trip. It would be great to calculate average trip time for specific days, hours etc. With this info we don't need to make 30 min ride trip assumption but be more percise.
  3. Data regarding costs for both the company and drivers. It would help to better estimate the y_true.
  4. Access to Google maps or other APi to get more accurate distances.

7. How to validate the experiments?¶

  1. Implement the feature for a selected group of drivers and ask them not to communicate to other regarding the new app functionality and always follow the recommended zone instructions for specific time period (i.e. month) in the beginning of the day, call the option when there are no orders for at least 10 min.
  2. Select the same size drivers group for which the feature will not be implemented.
  3. Create metrics: I would propose to take "average time of waiting for new order", "number of orders per driver per day", "average company's profit per driver per day"
  4. Evaluate the metrics for 1 month and compare the results for 2 groups. If there are statistically significant improvements - implement the feature for all drivers and make a short training on how to use it.
In [ ]: